本次作业中,自行选择2个数据集进行探索性分析与预处理。
可选数据集包括:
Consumer & Visitor Insights For Neighborhoods
Wine Reviews (Chosen)
Oakland Crime Statistics 2011 to 2016 (Chosen)
Chicago Building Violations
Trending YouTube Video Statistics
Melbourne Airbnb Open Data
MLB Pitch Data 2015-2018
观察数据集中缺失数据,分析其缺失的原因。分别使用下列四种策略对缺失值进行处理:
注意:在处理后,要可视化地对比新旧数据集。
!pip3 install impyute
Requirement already satisfied: impyute in c:\users\feimo\appdata\local\programs\python\python37\lib\site-packages (0.0.8) Requirement already satisfied: numpy in c:\users\feimo\appdata\local\programs\python\python37\lib\site-packages (from impyute) (1.20.2) Requirement already satisfied: scipy in c:\users\feimo\appdata\local\programs\python\python37\lib\site-packages (from impyute) (1.6.2) Requirement already satisfied: scikit-learn in c:\users\feimo\appdata\local\programs\python\python37\lib\site-packages (from impyute) (0.24.1) Requirement already satisfied: joblib>=0.11 in c:\users\feimo\appdata\local\programs\python\python37\lib\site-packages (from scikit-learn->impyute) (1.0.1) Requirement already satisfied: threadpoolctl>=2.0.0 in c:\users\feimo\appdata\local\programs\python\python37\lib\site-packages (from scikit-learn->impyute) (2.1.0)
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
import math
import re
import sys
import csv
"""
标称属性,给出每个可能取值的频数
数值属性,给出5数概括及缺失值的个数
"""
# 给出数据的基本信息,判断标称属性和数值属性
def columns_info(df,show_shape=True,new_df=None):
print(df.info())
if show_shape: print(df.shape)
if new_df is not None:
print()
print(new_df.info())
if show_shape: print(new_df.shape)
return None
# 标称属性每个可能取值的频数及直方图
def nominal_summary(df,nominal_index=None,head_n=50,new_df=None):
# 根据频数绘制直方图
def bar_describe(data,new_data=None,head_n=50):
if new_data is None:
plt.figure(figsize=(24,8))
plt.title(data.name,fontsize=30)
plt.bar(data.index[:head_n], data.values[:head_n])
plt.xticks(rotation=90)
plt.show()
else:
plt.figure(figsize=(24,8))
plt.title(data.name,fontsize=30)
plt.bar(data.index[:head_n], data.values[:head_n],color='b',label=data.name)
plt.bar(new_data.index[:head_n],new_data.values[:head_n],color='r',label='new_'+new_data.name)
plt.xticks(rotation=90)
plt.legend()
plt.show()
# 获取数据中每个取值的频数
frequency = {key: df[key].value_counts() for key in df.columns}
new_frequency = None
if new_df is not None:
new_frequency = {key: new_df[key].value_counts() for key in new_df.columns}
if nominal_index is None:
nominal_index=df.columns
if new_frequency is None:
for key in nominal_index:
bar_describe(data=frequency[key],new_data=None,head_n=head_n)
else:
for key in nominal_index:
bar_describe(data=frequency[key],new_data=new_frequency[key],head_n=head_n)
return None
# 数值属性缺失值个数及五数概括
def numerical_summary(df,numerical_index=None,new_df=None):
shape=df.shape
# 格式化输出五数概括和缺失值个数
def data_describe(data,new_data=None):
print('descriptive statistics (%s):' % data.name)
info=data.describe()
new_info = None
if new_data is not None:
new_info = new_data.describe()
print("Min: {:.4f}\tQ1(25%): {:.4f} \tQ2(50%): {:.4f} \tQ3(75%): {:.4f} \tMax: {:.4f}".format(
info['min'],info['25%'],info['50%'],info['75%'],info['max']))
print("Missing: {:d}".format(int(shape[0] - info['count'])))
print()
if new_info is not None:
print("\033[95m new Min: {:.4f}\tQ1(25%): {:.4f} \tQ2(50%): {:.4f} \tQ3(75%): {:.4f} \tMax: {:.4f} \033[0m".format(
new_info['min'],new_info['25%'],new_info['50%'],new_info['75%'],new_info['max']))
print("\033[95m new Missing: {:d} \033[0m".format(int(shape[0] - new_info['count'])))
print()
if numerical_index is None:
print("Please provide the numerical index needed to be describe")
return None
# 获取数值属性的5数概况和缺失值个数
if new_df is None:
for key in numerical_index:
data_describe(df[key])
else:
for key in numerical_index:
data_describe(df[key],new_df[key])
return None
wine_reviews_dir = "Wine"
winemag150k=pd.read_csv(wine_reviews_dir+"/winemag-data_first150k.csv",sep=',',header='infer',index_col=0)
columns_info(winemag150k)
<class 'pandas.core.frame.DataFrame'> Int64Index: 150930 entries, 0 to 150929 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 country 150925 non-null object 1 description 150930 non-null object 2 designation 105195 non-null object 3 points 150930 non-null int64 4 price 137235 non-null float64 5 province 150925 non-null object 6 region_1 125870 non-null object 7 region_2 60953 non-null object 8 variety 150930 non-null object 9 winery 150930 non-null object dtypes: float64(1), int64(1), object(8) memory usage: 12.7+ MB None (150930, 10)
# Nominal index
nominal_index = ['country', 'designation', 'province', 'region_1', 'region_2', 'variety', 'winery']
# Numerical index
numerical_index = ['points', 'price']
nominal_summary(winemag150k,new_df=None,nominal_index=nominal_index,head_n=50)
numerical_summary(winemag150k,new_df=None,numerical_index=numerical_index)
descriptive statistics (points): Min: 80.0000 Q1(25%): 86.0000 Q2(50%): 88.0000 Q3(75%): 90.0000 Max: 100.0000 Missing: 0 descriptive statistics (price): Min: 4.0000 Q1(25%): 16.0000 Q2(50%): 24.0000 Q3(75%): 40.0000 Max: 2300.0000 Missing: 13695
winemag130k=pd.read_csv(wine_reviews_dir+"/winemag-data-130k-v2.csv",sep=',',header='infer',index_col=0)
columns_info(winemag130k)
<class 'pandas.core.frame.DataFrame'> Int64Index: 129971 entries, 0 to 129970 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 country 129908 non-null object 1 description 129971 non-null object 2 designation 92506 non-null object 3 points 129971 non-null int64 4 price 120975 non-null float64 5 province 129908 non-null object 6 region_1 108724 non-null object 7 region_2 50511 non-null object 8 taster_name 103727 non-null object 9 taster_twitter_handle 98758 non-null object 10 title 129971 non-null object 11 variety 129970 non-null object 12 winery 129971 non-null object dtypes: float64(1), int64(1), object(11) memory usage: 13.9+ MB None (129971, 13)
# Nominal index
nominal_index = ['country','description', 'designation', 'province', 'region_1', 'region_2',
'taster_name','taster_twitter_handle','title',
'variety', 'winery']
# Numerical index
numerical_index = ['points', 'price']
nominal_summary(winemag130k,nominal_index,head_n=50)
numerical_summary(winemag130k,numerical_index)